# Lookup table: two-letter USPS postal code -> full state/territory name.
# Covers the 50 states plus DC and the inhabited territories
# (AS, GU, MP, PR, VI) -- 56 entries in total.
POSTAL_TO_STATE = local({
  codes = c('AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
            'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY',
            'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE',
            'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK',
            'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT',
            'VI', 'VA', 'WA', 'WV', 'WI', 'WY')
  full_names = c('Alabama', 'Alaska', 'American Samoa', 'Arizona',
                 'Arkansas', 'California', 'Colorado', 'Connecticut',
                 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
                 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
                 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
                 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
                 'Missouri', 'Montana', 'Nebraska', 'Nevada',
                 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
                 'North Carolina', 'North Dakota',
                 'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
                 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
                 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
                 'Utah', 'Vermont', 'Virgin Islands', 'Virginia',
                 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming')
  as.list(setNames(full_names, codes))
})
# Lowercase postal codes for the 50 US states (no DC, no territories).
# `state.abb` (base R datasets) lists exactly these abbreviations in
# alphabetical order by state name, which matches the order used here.
states = tolower(state.abb)
# URL template for the Google COVID-19 search-trends symptoms dataset.
# '{state}' and '{state_underscore}' are placeholders filled in per state
# before downloading (see load_state_data).
BASE_DAILY_URL = paste(
  c('https://raw.githubusercontent.com/google-research/open-covid-19-data',
    'master/data/exports/search_trends_symptoms_dataset',
    'United%20States%20of%20America/subregions/{state}',
    '2020_US_{state_underscore}_daily_symptoms_dataset.csv'),
  collapse = '/')

# In-memory cache of per-state data frames, keyed by postal code.
cache_data_list = list()
# Human-readable descriptions for a handful of the less self-explanatory
# symptom signals. Symptoms without an entry here get NA after joins.
signal_description_df = tibble(
  signal = c('Podalgia',
             'Anosmia',
             'Purpura',
             'Radiculopathy',
             'Ageusia',
             'Erythema chronicum migrans',
             'Photodermatitis'),
  description = c('pain in the foot',
                  'loss of smell',
                  "red/purple skin spots; 'blood spots'",
                  'pinched nerve',
                  'loss of taste',
                  'expanding rash early in lyme disease',
                  'allergic rash that reqs light')
)
# Expand a two-letter postal code (any case) to the full state/territory
# name via the POSTAL_TO_STATE lookup table.
expand_state_name = function(state) {
  POSTAL_TO_STATE[[toupper(state)]]
}
# Load the symptoms dataset for one state (lowercase two-letter postal
# code). Results are memoized two ways: in memory via the global
# `cache_data_list` (updated with `<<-`), and on disk as cache/<state>.csv.
# NOTE(review): assumes the cache/ directory exists; it is created by the
# symptom_df bootstrap block below before the first call -- confirm when
# reusing this function elsewhere.
load_state_data = function(state) {
  # Fast path: already loaded this session.
  if (state %in% names(cache_data_list)) return (cache_data_list[[state]])
  # Check whether there is a cached version
  state_fname = sprintf('cache/%s.csv', state)
  # if there isn't, then download
  if (!file.exists(state_fname)) {
    state_name = expand_state_name(state)
    message(sprintf('Downloading data for %s...', state_name))
    # The remote path uses the full state name: underscores inside the file
    # name, %20 escapes inside the URL path segments.
    state_name_underscore = str_replace_all(state_name, ' ', '_')
    STATE_DAILY_URL = str_replace_all(BASE_DAILY_URL,
                                      fixed('{state}'), state_name)
    STATE_DAILY_URL = str_replace_all(STATE_DAILY_URL,
                                      fixed('{state_underscore}'),
                                      state_name_underscore)
    STATE_DAILY_URL = str_replace_all(STATE_DAILY_URL,
                                      fixed(' '),
                                      '%20')
    download.file(STATE_DAILY_URL, state_fname)
  }
  single_state = readr::read_csv(state_fname)
  # Publish to the session-level cache (global assignment is intentional).
  cache_data_list[[state]] <<- single_state
  return (single_state)
}
# Extract one symptom's daily county-level time series for one state and
# reshape it into a covidcast-style data frame.
#
# state:   lowercase two-letter postal code (passed to load_state_data).
# symptom: symptom name without the 'symptom:' column prefix.
#
# Returns a data frame with columns geo_value (5-digit FIPS string),
# signal, time_value, direction, issue, lag, value, stderr, sample_size,
# data_source. Rows with a missing symptom value are dropped.
pull_data_state = function(state, symptom) {
  single_state = load_state_data(state)
  # Rows with a sub_region_2_code are county-level; the rest are
  # state-level aggregates, which we drop. (The original code also ran
  # unique() on this column and discarded the result -- dead code, removed.)
  single_state_counties = single_state[!is.na(single_state$sub_region_2_code), ]
  selected_symptom = paste0('symptom:', symptom)
  single_state_symptom = single_state_counties[, c('sub_region_2_code',
                                                   'date',
                                                   selected_symptom)]
  # Shape into what we want
  colnames(single_state_symptom) = c('geo_value', 'time_value', 'value')
  single_state_symptom = single_state_symptom %>%
    filter(!is.na(value))
  single_state_symptom %>% transmute(
    # Zero-pad county FIPS codes to the canonical 5 digits; as.integer
    # (rather than as.numeric) matches the %d format exactly.
    geo_value = sprintf('%05d', as.integer(geo_value)),
    signal = symptom,
    time_value = time_value,
    direction = NA,
    issue = lubridate::today(),
    lag = issue - time_value,
    value = value,
    stderr = NA,
    sample_size = NA,
    data_source = 'google_symptoms'
  )
}
# Build the full symptom data frame across all states and symptoms.
# Cached on disk as symptom_df.RDS so repeated runs skip the downloads.
if (file.exists('symptom_df.RDS')) {
  symptom_df = readRDS('symptom_df.RDS')
  symptom_names = unique(symptom_df$signal)
} else {
  # Create the download cache directory; suppress the warning that
  # dir.create otherwise emits when the directory already exists.
  dir.create('./cache/', showWarnings = FALSE)
  # Use one small state to discover the set of 'symptom:' columns.
  ak = load_state_data('ak')
  symptom_cols = colnames(ak)[
    str_detect(colnames(ak), 'symptom:')]
  symptom_names = str_replace(symptom_cols, fixed('symptom:'), '')
  # Preallocate one slot per symptom, each holding the stacked states.
  symptom_df_list = vector('list', length(symptom_names))
  names(symptom_df_list) = symptom_names
  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    states_list = vector('list', length(states))
    # seq_along is safe even if `states` were empty (1:length is not).
    for (idx in seq_along(states)) {
      state = states[idx]
      states_list[[idx]] = pull_data_state(state, symptom)
    }
    symptom_df_list[[symptom]] = bind_rows(states_list)
  }
  symptom_df = bind_rows(symptom_df_list)
  saveRDS(symptom_df, 'symptom_df.RDS')
}
# Analysis window for the case-vs-symptom comparison.
start_day = "2020-03-01"
end_day = "2020-09-15"

# 7-day-averaged incident case counts per county, from JHU CSSE via the
# covidcast API.
df_inum = covidcast_signal(data_source = "jhu-csse",
                           signal = "confirmed_7dav_incidence_num",
                           start_day = start_day, end_day = end_day)

# Restrict attention to "active" counties: those with at least `case_num`
# total cases over the window.
case_num = 500
geo_values = df_inum %>%
  group_by(geo_value) %>%
  summarize(total = sum(value)) %>%
  filter(total >= case_num) %>%
  pull(geo_value)

df_inum_act = df_inum %>% filter(geo_value %in% geo_values)
symptom_df_act = symptom_df %>% filter(geo_value %in% geo_values)
Here we plot the availability of each symptom over time (the proportion is the fraction of counties for which the symptom was available). We see that for each signal, the availability level is consistent over time, subject to a strong weekend effect.
# For each day and symptom, the fraction of filtered counties that
# reported a value for that symptom.
availability_df = symptom_df_act %>%
  group_by(time_value, signal) %>%
  summarize(prop_available = n() / length(geo_values)) %>%
  ungroup()
## `summarise()` regrouping output by 'time_value' (override with `.groups` argument)
# One faint line per symptom: availability over time.
plt = (ggplot(availability_df)
       + geom_line(aes(x = time_value,
                       y = prop_available,
                       group = factor(signal)),
                   color = 'dodgerblue4', size = 0.1))
plt
The symptoms for which data is most sparse are:
# Symptoms available in at most 5% of counties on an average day,
# sparsest first.
most_missing = availability_df %>%
  group_by(signal) %>%
  summarize(avg_available = mean(prop_available)) %>%
  ungroup() %>%
  filter(avg_available <= 0.05) %>%
  arrange(avg_available)
## `summarise()` ungrouping output (override with `.groups` argument)
print(most_missing)
## # A tibble: 24 x 2
## signal avg_available
## <chr> <dbl>
## 1 Viral pneumonia 0.0219
## 2 Aphonia 0.0266
## 3 Crackles 0.0281
## 4 Burning Chest Pain 0.0291
## 5 Urinary urgency 0.0296
## 6 Polydipsia 0.0297
## 7 Photodermatitis 0.0305
## 8 Shallow breathing 0.0331
## 9 Dysautonomia 0.0332
## 10 Allergic conjunctivitis 0.0346
## # … with 14 more rows
For the signal that is most sparsely available, the number of counties at which it tends to be available daily is:
# Translate the sparsest symptom's average availability into the expected
# number of reporting counties per day.
print(min(most_missing$avg_available) * length(geo_values))
## [1] 28.94071
Based on this, we leave all the symptoms in for the full correlations analysis.
# Per-day Spearman correlation between each symptom and 7-day-average
# incident cases, computed across counties (by = "time_value").
# Results are cached on disk as cor_df.RDS.
cor_list = vector('list', length(symptom_names))
names(cor_list) = symptom_names
if (file.exists('cor_df.RDS')) {
  cor_df = readRDS('cor_df.RDS')
} else {
  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    # One correlation value per day, across all active counties.
    df_cor1 = covidcast_cor(symptom_df_act %>% filter(signal == symptom),
                            df_inum_act,
                            by = "time_value",
                            method = "spearman")
    # Tag each per-day correlation frame with its symptom name.
    df_cor1['signal'] = symptom
    cor_list[[symptom]] = df_cor1
  }
  cor_df = bind_rows(cor_list)
  saveRDS(cor_df, 'cor_df.RDS')
}
# Attach human-readable symptom descriptions. The join key must be passed
# as `by=`; the original `on='signal'` is not a dplyr argument and was
# silently ignored (the "Joining, by = 'signal'" message shows dplyr fell
# back to auto-detecting the key).
cor_df = cor_df %>%
  left_join(signal_description_df, by = 'signal')
## Joining, by = "signal"
## Warning: Removed 2532 row(s) containing missing values (geom_path).
When we discuss the “size” of a correlation, we mean the absolute value of the correlation.
# For each symptom, keep the day(s) where |correlation| peaks, then take
# the 5 symptoms with the largest all-time |correlation|.
top_cor_signals = plot_cor_df %>%
  group_by(signal) %>%
  filter(abs(value) == max(abs(value), na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(abs(value))) %>%
  head(5)
# Five-number-style summary (plus mean) of each top signal's daily
# correlations over the full period.
top_cor_sum_stats = plot_cor_df %>%
  filter(signal %in% top_cor_signals$signal) %>%
  group_by(signal) %>%
  summarize(min = min(value, na.rm = TRUE),
            quart1 = quantile(value, 0.25, na.rm = TRUE),
            med = median(value, na.rm = TRUE),
            mean = mean(value, na.rm = TRUE),
            quart3 = quantile(value, 0.75, na.rm = TRUE),
            max = max(value, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
print('Symptoms with the largest all-time correlation:')
## [1] "Symptoms with the largest all-time correlation:"
# Join the summary stats onto the top signals. The key must be given via
# `by=`; the original `on='signal'` is not a dplyr argument and was
# ignored, leaving the key to auto-detection.
print(top_cor_signals %>% left_join(top_cor_sum_stats, by = 'signal')
      %>% select(-time_value, -value),
      width = 100)
## Joining, by = "signal"
## # A tibble: 5 x 8
## signal description min quart1
## <chr> <chr> <dbl> <dbl>
## 1 Viral pneumonia <NA> -1 -0.396
## 2 Anosmia loss of smell -0.0788 0.311
## 3 Ageusia loss of taste -0.423 0.185
## 4 Erythema chronicum migrans expanding rash early in lyme disease -0.710 -0.562
## 5 Photodermatitis allergic rash that reqs light -0.709 -0.461
## med mean quart3 max
## <dbl> <dbl> <dbl> <dbl>
## 1 -0.00360 -0.0600 0.295 0.668
## 2 0.556 0.499 0.721 0.833
## 3 0.478 0.412 0.672 0.797
## 4 -0.307 -0.344 -0.205 0.221
## 5 -0.343 -0.322 -0.172 0.282
# Spaghetti plot of per-day correlations: every symptom drawn as a faint
# background line, with the top-5 signals (by all-time max |corr|)
# highlighted in color.
# NOTE(review): `plot_cor_df` is not defined in this chunk -- presumably a
# plotting-ready subset of cor_df built elsewhere; confirm before running
# this chunk standalone.
plt = (ggplot(plot_cor_df)
       # Background layer: every symptom outside the top 5, in cornsilk.
       + geom_line(aes(x=time_value,
                       y=value,
                       group=factor(signal)),
                   data=plot_cor_df %>% filter (
                     !signal %in% top_cor_signals$signal
                   ),
                   color='cornsilk',
                   size=0.1,
                   alpha=1.0)
       # Foreground layer: the top-5 signals, one color each.
       + geom_line(aes(x=time_value,
                       y=value,
                       group=factor(signal),
                       colour=factor(signal)
                   ),
                   data=plot_cor_df %>% filter (
                     signal %in% top_cor_signals$signal,
                   ),
                   #color='darkorange',
                   size=0.3)
       + ylab('rank correlation')
       # Semi-monthly axis breaks spanning the analysis window.
       + scale_x_date(breaks=lubridate::ymd(c('2020-03-01',
                      '2020-03-15', '2020-04-01', '2020-04-15', '2020-05-01',
                      '2020-05-15', '2020-06-01', '2020-06-15', '2020-07-01',
                      '2020-07-15', '2020-08-01', '2020-08-15',
                      '2020-09-01', '2020-09-15')))
       + theme(axis.text.x = element_text(angle = 45))
       + ggtitle("Top 5 signals by all-time max(|corr|)")
)
plt
## Warning: Removed 2502 row(s) containing missing values (geom_path).
## Warning: Removed 30 row(s) containing missing values (geom_path).
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Symptoms that consistently stay away from 0 correlation:"
## Joining, by = "signal"
## # A tibble: 5 x 8
## signal description min quart1 med mean quart3
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Podalgia pain in the foot -0.431 -0.270 -0.225 -0.232 -0.182
## 2 Restless legs syndrome <NA> -0.533 -0.406 -0.311 -0.308 -0.225
## 3 Hair loss <NA> 0.0440 0.261 0.320 0.305 0.372
## 4 Radiculopathy pinched nerve -0.477 -0.336 -0.255 -0.251 -0.158
## 5 Hyperpigmentation <NA> 0.0267 0.220 0.294 0.295 0.386
## max
## <dbl>
## 1 -0.0781
## 2 -0.0672
## 3 0.432
## 4 -0.0425
## 5 0.485
## Warning: Removed 2502 row(s) containing missing values (geom_path).
## Warning: Removed 30 row(s) containing missing values (geom_path).
## [1] "Podalgia" "Restless legs syndrome" "Hair loss"
## [4] "Radiculopathy" "Hyperpigmentation"
## character(0)
Interestingly, although anosmia technically is one of the “most stable” by our criterion, its distribution actually covers zero.
# Per-county (all-time) Spearman correlation between each symptom and
# cases, cached on disk as geo_cor_df.RDS. Mirrors the per-day computation
# above, but with by = "geo_value": one correlation per county, computed
# over time.
if (file.exists('geo_cor_df.RDS')) {
  geo_cor_df = readRDS('geo_cor_df.RDS')
} else {
  geo_cor_list = vector('list', length(symptom_names))
  names(geo_cor_list) = symptom_names
  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    df_cor1 = covidcast_cor(symptom_df_act %>% filter(signal == symptom),
                            df_inum_act,
                            by = "geo_value",
                            method = "spearman")
    # Tag each per-county correlation frame with its symptom name.
    df_cor1['signal'] = symptom
    geo_cor_list[[symptom]] = df_cor1
  }
  geo_cor_df = bind_rows(geo_cor_list)
  saveRDS(geo_cor_df, 'geo_cor_df.RDS')
}
# Attach human-readable symptom descriptions. The join key must be passed
# as `by=`; the original `on='signal'` is not a dplyr argument and was
# silently ignored (dplyr auto-detected the key instead, as the
# "Joining, by" message shows).
geo_cor_df = geo_cor_df %>%
  left_join(signal_description_df, by = 'signal')
## Joining, by = "signal"
# Draw a county choropleth of the per-county correlations for each of the
# top-5 signals, by dressing the correlation frame up as a covidcast_signal
# object so covidcast's plot() method can be reused.
# NOTE(review): `min_available_time` is not defined in this chunk -- it is
# presumably computed earlier; confirm before running standalone.
for (symptom in top_cor_signals$signal) {
  df_cor2 = geo_cor_df %>% filter (signal == symptom)
  # plot.covidcast_signal expects time_value/issue columns; fill them with
  # a single placeholder date since these correlations are all-time.
  df_cor2$time_value = min_available_time
  df_cor2$issue = min_available_time
  # Masquerade as a county-level covidcast_signal object.
  attributes(df_cor2)$geo_type = 'county'
  class(df_cor2) = c("covidcast_signal", "data.frame")
  n_available_county = df_cor2 %>% filter (!is.na(value)) %>% nrow()
  # Plot choropleth maps, using the covidcast plotting functionality
  title_text = sprintf("Correlations between cases and %s (%d counties)",
                       symptom, n_available_county)
  # Append the plain-English description when we have one for this symptom.
  if (!is.na(df_cor2$description[1])) {
    title_text = paste0(title_text, '\n', sprintf('(%s)', df_cor2$description[1]))
  }
  print(plot(df_cor2,
             title = title_text,
             range = c(-1, 1), choro_col = c("orange","lightblue", "purple")))
}
# Same choropleth rendering as above, but for the signals in `top_min_cor`
# (the "consistently away from zero" set).
# NOTE(review): `top_min_cor` and `min_available_time` are not defined in
# this chunk -- presumably computed in an earlier (hidden) chunk; confirm
# before running standalone. This loop duplicates the one above and could
# share a helper function.
for (symptom in top_min_cor$signal) {
  df_cor2 = geo_cor_df %>% filter (signal == symptom)
  # Placeholder date columns so plot.covidcast_signal accepts the frame.
  df_cor2$time_value = min_available_time
  df_cor2$issue = min_available_time
  # Masquerade as a county-level covidcast_signal object.
  attributes(df_cor2)$geo_type = 'county'
  class(df_cor2) = c("covidcast_signal", "data.frame")
  n_available_county = df_cor2 %>% filter (!is.na(value)) %>% nrow()
  # Plot choropleth maps, using the covidcast plotting functionality
  title_text = sprintf("Correlations between cases and %s (%d counties)",
                       symptom, n_available_county)
  # Append the plain-English description when we have one for this symptom.
  if (!is.na(df_cor2$description[1])) {
    title_text = paste0(title_text, '\n', sprintf('(%s)', df_cor2$description[1]))
  }
  print(plot(df_cor2,
             title = title_text,
             range = c(-1, 1), choro_col = c("orange","lightblue", "purple")))
}
TODO:
Some things I’m still worried about / next steps: